## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [conflicted] Will prefer dplyr::filter over any other package.
## [conflicted] Will prefer dplyr::select over any other package.
## [conflicted] Will prefer dplyr::slice over any other package.
## [conflicted] Will prefer dplyr::rename over any other package.
## [conflicted] Will prefer dplyr::intersect over any other package.
mytheme = theme_bw(base_size = 10) +
theme(text = element_text(size=10, colour='black'),
title = element_text(size=10, colour='black'),
line = element_line(size=0.5),
axis.title = element_text(size=10, colour='black'),
axis.text = element_text(size=10, colour='black'),
axis.line = element_line(colour = "black"),
axis.ticks = element_line(size=0.5),
strip.background = element_blank(),
strip.text.y = element_blank(),
panel.grid = element_blank(),
panel.border = element_blank(),
legend.position = c(0.8, 0.8),
legend.text=element_text(size=10)) ## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1560 Columns: 48
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 29 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group
## dbl (16): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val$tool = factor(val$tool, levels = c("circseq_cup", "CIRI2", "CIRIquant", "CircSplice", "find_circ", "CirComPara2", "CIRCexplorer3", "circtools", "Sailfish-cir", "NCLscan", "NCLcomparator", "PFv2", "ecircscreen", "KNIFE", "circRNA_finder", "segemehl"))
all_circfirst how many per group in selected circ?
nr_detected = all_circ %>%
group_by(chr, start, end, circ_id, cell_line) %>%
summarise(n_detected = n()) %>%
ungroup()## `summarise()` has grouped output by 'chr', 'start', 'end', 'circ_id'. You can
## override using the `.groups` argument.
val_df = cq %>% left_join(nr_detected) %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, n_detected, cell_line) %>% unique() %>%
pivot_longer(cols = c(qPCR_val, RR_val, amp_seq_val), names_to = "val_type", values_to = "val")## Joining with `by = join_by(chr, start, end, circ_id, cell_line, n_detected)`
val_df$val_type = factor(val_df$val_type, levels = c('qPCR_val', 'RR_val', 'amp_seq_val'))
val_df %>%
ggplot(aes(n_detected, fill = val)) +
geom_bar() +
facet_grid(~val_type) +
mytheme +
scale_fill_manual(values = c('#CC79A7', '#00A875' ,'#999999')) +
ylab('number of circRNAs') +
xlab('number of tools the circRNAs was detected by')## Warning: Removed 54 rows containing non-finite values (`stat_count()`).
## Rows: 675 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool_1, tool_2, cell_line, count_group, tool_lt_1, tool_lt_2, tool_...
## dbl (9): nr_union, nr_intersection, perc_compound_val_1, perc_compound_val_2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
union_sub = simple_union %>%
# filter based on percentage increase
filter((nr_union - pmax(total_n_1, total_n_2))/total_cell_line > 0.0436) %>% # 1000 circ 0.0872
#filter(nr_union - pmax(total_n_1, total_n_2) > 999) %>%
filter(perc_compound_val_1 >= 0.9,
perc_compound_val_2 >= 0.9)add individual tools of interest
union_sub = union_sub %>%
bind_rows(simple_union %>%
filter(tool_1 == tool_2,
tool_1 %in% (union_sub %>% pull(tool_1, tool_2) %>% unique())))only keep one point for each combo
as percentage of all circ in that cell line
union_sub %>%
left_join(all_circ %>% select(circ_id, cell_line, count_group) %>%
filter(count_group == "count ≥ 5") %>%
unique() %>% count(cell_line) %>% rename(total_n = n)) %>%
mutate(perc_union = nr_union/total_n) %>%
#filter(cell_line == "HLF") %>%
ggplot(aes(w_val_rate, perc_union)) +
geom_point(aes(color = (tool_1 == tool_2))) +
geom_text_repel(aes(label=str_remove(tool_combo, '-'), color = (tool_1 == tool_2)), max.overlaps = 20, size = 3) +
mytheme +
theme(legend.position = 'NA') +
facet_wrap(~cell_line, nrow = 2, scales = 'free') +
scale_color_manual(values = c('#E69F00', '#0072B2')) +
scale_x_continuous(labels = scales::percent_format()) +
scale_y_continuous(labels = scales::percent_format()) +
xlab('(weighted) compound precision value') +
ylab('% of all predicted circRNAs')## Joining with `by = join_by(cell_line)`
#ggsave('../tmp_figures/figure_4B.pdf', width = 12, height = 10, units = "cm")
#ggsave('../tmp_figures/sup_figure_33.pdf', width = 20, height = 20, units = "cm")
# union_sub %>%
# left_join(all_circ %>% select(circ_id, cell_line, count_group) %>%
# filter(count_group == "count ≥ 5") %>%
# unique() %>% count(cell_line) %>% rename(total_n = n)) %>%
# mutate(perc_union = nr_union/total_n) %>%
# write_tsv('../tmp_figures/source_data_fig_4B.txt')check mean increase in number of circ (percentage)
simple_union %>%
filter(count_group == "count ≥ 5",
!tool_1 == tool_2,
perc_compound_val_1 >= 0.9,
perc_compound_val_2 >= 0.9) %>%
mutate(increase = nr_union - total_n_1,
increase_prec = increase/total_n_1) %>% #view()
pull(increase_prec) %>%
quantile() ## 0% 25% 50% 75% 100%
## 0.00000000 0.07247976 0.31844063 0.98775803 17.90560472